#coding=utf-8
import numpy as np
import matplotlib.pyplot as plt

class TwoLayerNet(object):
    """Two-layer fully connected neural network classifier.

    The input has dimension D, the hidden layer has dimension H and there
    are C classes. The network uses a softmax loss with L2 regularization
    on the weight matrices and a ReLU nonlinearity after the first fully
    connected layer:

        input - fully connected layer - ReLU - fully connected layer - softmax
    """

    def __init__(self, input_size, hidden_size, output_size, std=1e-4):
        """Initialize the model parameters.

        Weights are drawn from a standard normal distribution scaled by
        ``std``; biases are initialized to zero. All parameters are stored
        in the dictionary ``self.params`` under these keys:

        W1: first layer weights, shape (D, H)
        b1: first layer biases, shape (H,)
        W2: second layer weights, shape (H, C)
        b2: second layer biases, shape (C,)

        :param input_size: dimension D of the input data
        :param hidden_size: number of hidden units H
        :param output_size: number of classes C
        :param std: scale applied to the random initial weights
        """
        self.params = {
            'W1': std * np.random.randn(input_size, hidden_size),
            'b1': np.zeros(hidden_size),
            'W2': std * np.random.randn(hidden_size, output_size),
            'b2': np.zeros(output_size),
        }

    def _forward(self, X):
        """Run the forward pass and return (hidden activations, class scores)."""
        hidden = np.maximum(0, X.dot(self.params['W1']) + self.params['b1'])
        scores = hidden.dot(self.params['W2']) + self.params['b2']
        return hidden, scores

    def loss(self, X, y=None, reg=0.0):
        """Compute the loss and gradients of the two-layer network.

        :param X: input data of shape (N, D); X[i] is one training example
        :param y: labels for X with 0 <= y[i] < C, or None
        :param reg: L2 regularization strength
        :return: if ``y`` is None, the class scores of shape (N, C);
            otherwise a tuple ``(loss, grads)`` where ``loss`` is the mean
            softmax loss plus the regularization term and ``grads`` maps
            each key of ``self.params`` to the gradient of the loss with
            respect to that parameter
        """
        w1, w2 = self.params['W1'], self.params['W2']
        num_samples = X.shape[0]

        # Forward pass.
        hidden, scores = self._forward(X)
        if y is None:
            return scores

        rows = np.arange(num_samples)
        labels = np.asarray(y)

        # Log-softmax computed directly from max-shifted scores: shifting
        # prevents overflow in exp, and staying in log space avoids taking
        # log(0) when a class probability underflows.
        shifted = scores - np.max(scores, axis=1, keepdims=True)
        log_probs = shifted - np.log(
            np.sum(np.exp(shifted), axis=1, keepdims=True))

        data_loss = -np.sum(log_probs[rows, labels]) / num_samples
        reg_loss = 0.5 * reg * (np.sum(w1 * w1) + np.sum(w2 * w2))
        loss = data_loss + reg_loss

        # Backward pass. Gradient of the mean softmax loss w.r.t. the
        # scores is (probabilities - one_hot(labels)) / N.
        dscores = np.exp(log_probs)
        dscores[rows, labels] -= 1
        dscores /= num_samples

        grads = {}
        grads['W2'] = hidden.T.dot(dscores) + reg * w2
        grads['b2'] = np.sum(dscores, axis=0)

        # ReLU passes gradient only through units that were active.
        dhidden = dscores.dot(w2.T) * (hidden > 0)
        grads['W1'] = X.T.dot(dhidden) + reg * w1
        grads['b1'] = np.sum(dhidden, axis=0)

        return loss, grads

    def train(self, X, y, X_val, y_val, learning_rate=1e-3, learing_rate_decay=0.95, reg=1e-5, num_iter=100, batch_size=200, verbose=False):
        """Train the network with stochastic gradient descent.

        Each iteration samples a minibatch (with replacement) and takes one
        gradient descent step on every parameter. Once per epoch the
        accuracy on the current minibatch and on the validation set is
        recorded and the learning rate is decayed.

        :param X: training data of shape (N, D)
        :param y: training labels of shape (N,)
        :param X_val: validation data of shape (N_val, D)
        :param y_val: validation labels of shape (N_val,)
        :param learning_rate: initial step size
        :param learing_rate_decay: factor the learning rate is multiplied
            by after each epoch (the misspelled name is kept for backward
            compatibility with existing callers)
        :param reg: L2 regularization strength
        :param num_iter: number of SGD iterations to run
        :param batch_size: number of training examples per iteration
        :param verbose: if True, print the loss every 100 iterations
        :return: dict with keys 'loss_history', 'train_acc_history' and
            'val_acc_history'
        """
        num_train = X.shape[0]
        # Floor division keeps the epoch length an integer so that the
        # ``it % iterations_per_epoch`` check fires exactly once per epoch.
        iterations_per_epoch = max(num_train // batch_size, 1)

        loss_history = []
        train_acc_history = []
        val_acc_history = []

        for it in range(num_iter):
            # Sample a random minibatch (with replacement).
            idx = np.random.choice(num_train, batch_size, replace=True)
            X_batch = X[idx]
            y_batch = y[idx]

            loss, grads = self.loss(X_batch, y=y_batch, reg=reg)
            loss_history.append(loss)

            # Gradient descent step: every parameter, including b2, moves
            # against its gradient.
            for name in ('W1', 'b1', 'W2', 'b2'):
                self.params[name] -= learning_rate * grads[name]

            if verbose and it % 100 == 0:
                print('iteration %d / %d: loss %f'%(it, num_iter, loss))

            # Once per epoch: record accuracies and decay the learning rate.
            if it % iterations_per_epoch == 0:
                train_acc = (self.predict(X_batch) == y_batch).mean()
                val_acc = (self.predict(X_val) == y_val).mean()
                train_acc_history.append(train_acc)
                val_acc_history.append(val_acc)
                learning_rate *= learing_rate_decay

        return {'loss_history':loss_history, 'train_acc_history':train_acc_history, 'val_acc_history': val_acc_history}

    def predict(self, X):
        """Predict labels for the given data using the current weights.

        :param X: data of shape (N, D)
        :return: array of shape (N,) with the predicted label c of each
            row, 0 <= c < C (the class with the highest score)
        """
        _, scores = self._forward(X)
        return np.argmax(scores, axis=1)




